#!/usr/bin/env python
# coding=utf-8

import os, sys, re, traceback, logging
import urllib
import time
import datetime
import getopt
import urllib2
from Route import *


# Captures the body of every <line>...</line> element.  The closing tag may
# carry whitespace around its slash.  VERBOSE makes the layout whitespace in
# the pattern insignificant; DOTALL lets "." run across newlines.
vacation_info_re_pre = re.compile(
    r'''
		<line>(.*?)<\s*/\s*line>
		''',
    flags=re.VERBOSE | re.DOTALL
)

# Yields (tag, content) pairs: group 1 is the text between "<" and ">" of an
# opening tag, group 2 is everything up to the next closing tag with leading
# whitespace skipped.  The closing tag is not required to match the opener.
xml_cdata_node_re = re.compile(
    r'''
		<(.*?)>\s*([\S\s]*?)</.*?>
		''',
    flags=re.VERBOSE | re.DOTALL
)

# Matches the "第N天" heading that introduces each day of a schedule.
_DAY_HEADING_RE = re.compile(r"第\d+天")


def _print_schedule_days(schedule):
    """Write one "第 N 天: <text>" line to stdout per day section of *schedule*."""
    sections = _DAY_HEADING_RE.split(schedule)
    # sections[0] is whatever precedes the first heading, so it is skipped;
    # the remaining sections are numbered by position, starting at 1.
    for day, text in enumerate(sections[1:], 1):
        sys.stdout.write("第 %d 天: %s\n" % (day, text))


def do_crawl():
    """Download each configured XML feed and print the schedule of one record.

    For every (label, url) pair in the feed table the document is fetched,
    split into ``<line>`` records, and the first record that yields any
    tag/content pairs has its ``schedule`` field printed to stdout, one line
    per day section.  Records that yield no pairs are reported on stderr and
    skipped.

    Returns:
        None.  Any ``Exception`` raised while crawling is caught and its
        traceback is printed to stdout; ``KeyboardInterrupt`` and
        ``SystemExit`` are allowed to propagate.
    """
    # NOTE(review): the label's trailing characters look mis-encoded; the
    # label is not used below, so it is left untouched -- confirm the intent.
    api_data = [
        ('踏破铁鞋ￌﾤￆￆￌ￺￐ﾬ', 'http://www.tapotiexie.com/bb.xml'),
    ]

    try:
        for _label, server_url in api_data:
            # TODO: an earlier note suggested fetching through
            # curl_handle.open(server_url) instead of urllib2.
            response = urllib2.urlopen(server_url)
            try:
                html = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()

            for vacation_info in vacation_info_re_pre.findall(html):
                fields = dict(xml_cdata_node_re.findall(vacation_info))
                if not fields:
                    sys.stderr.write('cannot match xml_cdata_node_re\n')
                    continue
                _print_schedule_days(fields.get('schedule', ''))
                # Only the first record that parses is reported per feed.
                break
    except Exception:
        # Best-effort crawl: report the failure on stdout (as before) rather
        # than aborting the caller.
        traceback.print_exc(file=sys.stdout)

# Entry point: start the crawl whenever this file is executed.  There is no
# ``if __name__ == "__main__"`` guard, so importing the module runs it too.
do_crawl()



